# Data manipulation and visualization
library(tidyverse)
library(here)
library(janitor)
library(patchwork)
#library(grid)
#library(gridExtra)
#library(ghibli)
#library(RColorBrewer)
#library(gt)
#library(gtExtras)
# Data analysis/statistics
library(car) #ANOVA
library(ggeffects)
library(effects) #dependency for ggeffects
library(ggResidpanel) #residual panel
library(GGally)
library(arm) #discrete.histogram() function
library(MASS) #required for arm package
dispersion <- function (model){
sum(residuals(model, type = "pearson")^2)/(length(model$y)-length(model$coefficients))
}
The variables with are interested in are: - turf algae (column ‘ta’) - hard coral (column ‘hard_coral’) - sand (column ‘sand’) - crustose coralline algae (column ‘cca’)
fish <- read_csv(here("HW5","data","CRCP_Reef_Fish_Surveys_kole.csv"))
fish1 <- fish[,c("ta","hard_coral", "sand", "cca")]
ggpairs(fish1)
For each of the four predictors, make single-predictor models where the response variable is kole counts. Consider which of the predictors could be transformed to reduce skew along the x-axis. For educational purposes, start by using a poisson distribution for the counts.
#Based on the frequency distribution of the turf algae (ta) data, the predictor does not seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$ta)
ta_lm <- lm(count ~ ta, data = fish)
summary(ta_lm)
##
## Call:
## lm(formula = count ~ ta, data = fish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.110 -11.896 -6.206 4.173 111.395
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.11012 3.22407 9.029 < 2e-16 ***
## ta -0.26263 0.05234 -5.018 8.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.31 on 308 degrees of freedom
## Multiple R-squared: 0.07557, Adjusted R-squared: 0.07257
## F-statistic: 25.18 on 1 and 308 DF, p-value: 8.854e-07
resid_panel(ta_lm, plots = c("resid", "qq", "lev", "hist"))
ta_glm <- glm(count ~ ta, family= poisson, data = fish)
summary(ta_glm)
##
## Call:
## glm(formula = count ~ ta, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5784202 0.0352931 101.4 <2e-16 ***
## ta -0.0177069 0.0006581 -26.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6862.4 on 308 degrees of freedom
## AIC: 7783.4
##
## Number of Fisher Scoring iterations: 6
resid_panel(ta_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(ta_glm)
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(ta_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff
plot(allEffects(ta_glm))
Anova(ta_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## ta 710.68 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$ta, prob.col = 'orange', xlim = c(0, 100), ylim = c(0,0.18))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.15, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
Overdispersion: Poisson and Binomial distributions
makes assumptions on the variability of the data which is used for the
calculations of likelihood. Larger variability will result in smaller
Confidence Intervals (CI) and p-values, which is inaccurate.
With a dispersion of 30.10 (>>>1), the data is over dispersed
and it will result in smaller CI and p-values than it should.
dispersion(ta_glm)
## [1] 30.10215
#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$hard_coral)
coral_lm <- lm(count ~ hard_coral, data = fish)
summary(coral_lm)
##
## Call:
## lm(formula = count ~ hard_coral, data = fish)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.440 -9.619 -6.177 3.039 111.774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.31592 1.69524 3.136 0.00188 **
## hard_coral 0.43034 0.06312 6.818 4.88e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.69 on 308 degrees of freedom
## Multiple R-squared: 0.1311, Adjusted R-squared: 0.1283
## F-statistic: 46.49 on 1 and 308 DF, p-value: 4.876e-11
resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))
coral_glm <- glm(count ~ hard_coral, family= poisson, data = fish)
summary(coral_glm)
##
## Call:
## glm(formula = count ~ hard_coral, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0379324 0.0257021 79.29 <2e-16 ***
## hard_coral 0.0243714 0.0006953 35.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6481.7 on 308 degrees of freedom
## AIC: 7402.7
##
## Number of Fisher Scoring iterations: 6
resid_panel(coral_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(coral_glm)
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(coral_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff
plot(allEffects(coral_glm))
Anova(coral_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## hard_coral 1091.4 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$hard_coral, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
dispersion(coral_glm)
## [1] 28.13073
With a dispersion of 28.13 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.
#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$sand)
coral_lm <- lm(count ~ sand, data = fish)
summary(coral_lm)
resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))
sand_glm <- glm(count ~ sand, family= poisson, data = fish)
summary(sand_glm)
##
## Call:
## glm(formula = count ~ sand, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.946727 0.018964 155.39 <2e-16 ***
## sand -0.035770 0.001685 -21.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6998.7 on 308 degrees of freedom
## AIC: 7919.6
##
## Number of Fisher Scoring iterations: 6
resid_panel(sand_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(sand_glm)
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(sand_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff
plot(allEffects(sand_glm))
Anova(sand_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## sand 574.42 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$sand, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
dispersion(sand_glm)
## [1] 30.97697
With a dispersion of 30.98 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.
#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution.
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$cca)
cca_lm <- lm(count ~ cca, data = fish)
summary(coral_lm)
resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))
cca_glm <- glm(count ~ cca, family= poisson, data = fish)
summary(cca_glm)
##
## Call:
## glm(formula = count ~ cca, family = poisson, data = fish)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.270516 0.021449 105.85 <2e-16 ***
## cca 0.034430 0.001153 29.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 7573.1 on 309 degrees of freedom
## Residual deviance: 6846.7 on 308 degrees of freedom
## AIC: 7767.6
##
## Number of Fisher Scoring iterations: 6
resid_panel(cca_glm, plots = c("resid", "qq", "lev", "hist"))
resid_xpanel(cca_glm)
# lecture notes suggest using plot(log.y = TRUE) but I get an error.
ggeff <- ggeffect(cca_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff
plot(allEffects(cca_glm))
Anova(cca_glm)
## Analysis of Deviance Table (Type II tests)
##
## Response: count
## LR Chisq Df Pr(>Chisq)
## cca 726.46 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$cca, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")
dispersion(cca_glm)
## [1] 30.8293
With a dispersion of 30.83 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.
To deal with over-dispersion, we can use a negative binomial distribution or use random effects.